library(DBI)
library(broom)
library(splines)
library(stringr)
library(dplyr)
library(ggplot2)
nhanes.sqlite can download here
or generate by run notebooksource("../6_nhanes_data/phesant.R")
exposure_vars <-
read.delim("../data/select_vars.txt", header = FALSE)$V1
exposure_cols <- paste(exposure_vars, collapse = ", ")
# Load data and convert data types according to PHESEANT.
# Return data set and PHESEANT results.
load_data <- function(exposure_cols) {
nhanes_db <- dbConnect(RSQLite::SQLite(), "../nhanes.sqlite")
cols <-
'BMXWAIST, RIDAGEYR, BMXHT, BMXBMI, BMXWT, RIAGENDR,SDMVPSU,SDMVSTRA,RIDRETH1, years,'
data_sql <-
paste('SELECT', cols, exposure_cols, 'FROM merged_table')
data <- dbGetQuery(nhanes_db, data_sql)
data <- na.omit(data)
dbDisconnect(nhanes_db)
phs_res <- phesant(data)
data[, names(phs_res[phs_res %in% c("ordered", "unordered", "binary")])] <-
lapply(data[, names(phs_res[phs_res %in% c("ordered", "unordered", "binary")])], as.factor)
data[, names(phs_res[phs_res == "continuous"])] <-
lapply(data[, names(phs_res[phs_res == "continuous"])], as.numeric)
return(list(data = data, phs_res = phs_res))
}
data_phs <- load_data(exposure_cols)
data <- data_phs$data
phs_res <- data_phs$phs_res
invNorm <- function(x) {qnorm((rank(x) - 3/8)/(length(x) +1 - 6/8))}
min_max_norm <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
build_formula <- function(base_model,exposure_var,inv=FALSE) {
var <- "+ %s"
if(inv) var <- " + invNorm(%s)"
as.formula(
sprintf(paste0(base_model, var),
exposure_var
)
)
}
enwas <-
function(base_model,
exposure_vars,
data_set,
inv_norm = FALSE) {
num_var <- length(exposure_vars)
model_list <- vector(mode = "list", length = num_var)
num_cols <- sapply(data_set[, exposure_vars], is.numeric)
num_cols <- names(num_cols[num_cols == TRUE])
if (inv_norm) {
data_set[, num_cols] <- lapply(data_set[, num_cols], invNorm)
}
association_list <- data.frame()
factor_terms <- c() # to hold the factors terms
num_var <- length(exposure_vars)
for (i in 1:num_var) {
exposure <- exposure_vars[i]
# print(exposure)
if (is.factor(data_set[, exposure])) {
factor_terms <-
c(factor_terms, paste0(exposure, levels(data_set[, exposure])))
}
model <- build_formula(base_model, exposure)
mod <- lm(model, data_set)
model_list[[i]] <- mod
mod_df <- tidy(mod)
association_list <-
rbind(association_list, mod_df) # step 3b of algorithm
}
# comments the following code out for ignore the categorical phenotypes for now
# xwas_list <-
# association_list[association_list$term %in% c(exposure_vars, factor_terms), ]
xwas_list <-
association_list[association_list$term %in% exposure_vars, ]
# sd_x_list <- sapply(data_set[,num_cols],sd)
sd_x_list <- sapply(data_set, function(x)
sd(as.numeric(x)))
for (var in exposure_vars) {
xwas_list[grepl(var, xwas_list$term), c('estimate', 'std.error')] <-
xwas_list[grepl(var, xwas_list$term), c('estimate', 'std.error')] * sd_x_list[var]
}
xwas_list$fdr <-
p.adjust(xwas_list$p.value, method = 'BY')
xwas_list <- xwas_list |>
mutate(lower = estimate - 1.96 * std.error) |>
mutate(upper = estimate + 1.96 * std.error)
return (list(model_list = model_list, enwas_res = xwas_list))
}
# forest_plot <- function(xwas_result) {
# xwas_result |>
# ggplot(aes(
# x = term,
# y = estimate,
# ymin = lower,
# ymax = upper,
# colour = estimate
# )) +
# geom_pointrange() +
# coord_flip() + # flip coordinates (puts labels on y axis)
# xlab("Exposures") + ylab("Estimate (95% CI)") +
# theme_bw() # use a white background
# }
forest_plot <- function(xwas_result) {
xwas_result$col <- as.numeric(rownames(xwas_result)) %% 2
n <- nrow(xwas_result)
xwas_result |> mutate(xmin = seq(0.5, n - 0.5, by = 1),
xmax = seq(1.5, n + 0.5, by = 1)) |>
ggplot(aes(x = term,
y = estimate,
colour = estimate)) +
geom_point(size = 2, position = position_dodge(width = 1)) +
geom_errorbar(
aes(ymin = lower, ymax = upper),
position = position_dodge(width = 1),
width = 0.5,
cex = 1
) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_rect(
aes(
xmin = xmin,
xmax = xmax,
ymin = -Inf,
ymax = +Inf,
fill = factor(col)
),
color = 'black',
alpha = 0.3
) +
scale_fill_manual(values = c('white', 'grey78'), guide = 'none') +
coord_flip() + # flip coordinates (puts labels on y axis)
xlab("Exposures") + ylab("Estimate (95% CI)") +
theme_bw() # use a white background
}
forest_plot_mult <- function(xwas_result_list) {
xwas_result <- do.call("rbind", xwas_result_list)
xwas_result$EnWAS <-
rep(names(xwas_result_list), each = nrow(xwas_result_list[[1]]))
tem_df <- xwas_result_list[[1]]
n <- nrow(tem_df)
tem_df$col <- as.numeric(rownames(tem_df)) %% 2
tem_df <- tem_df|> mutate(xmin = seq(0.5, n - 0.5, by = 1),
xmax = seq(1.5, n + 0.5, by = 1))
xwas_result |> ggplot(aes(
x = term,
y = estimate,
colour = EnWAS
)) +
geom_point(size = 2, position = position_dodge(width = 1)) +
geom_errorbar(aes(ymin = lower, ymax = upper),
position = position_dodge(width = 1), width=0.5,cex=1) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_rect(data=tem_df,
aes(
xmin = xmin,
xmax = xmax,
ymin = -Inf,
ymax = +Inf,
fill = factor(col)
),
color = 'black',
alpha = 0.3
) +
scale_fill_manual(values = c('white', 'grey78'), guide = 'none') +
coord_flip() + # flip coordinates (puts labels on y axis)
xlab("Exposures") + ylab("Estimate (95% CI)") +
theme_bw() # use a white background
}
linear_model <- 'BMXWAIST ~ RIDAGEYR*RIAGENDR + BMXBMI'
linear_res2 <- enwas(linear_model, exposure_vars, data)
all_ns_model <-
'BMXWAIST ~ ns(RIDAGEYR, knots = seq(20, 80, by = 10)) * RIAGENDR + ns(BMXBMI,knots = c(seq(15, 45, by = 5),seq(45,65,by=10)))'
all_ns_res <- enwas(all_ns_model, exposure_vars, data)
lm_inverse_res <- enwas(linear_model, exposure_vars, data,inv = TRUE)
ns_inverse_res <- enwas(all_ns_model, exposure_vars, data,inv = TRUE)
forest_plot(linear_res2$enwas_res)
forest_plot_mult(
list(
linear = linear_res2$enwas_res,
ns = all_ns_res$enwas_res
)
)
forest_plot_mult(
list(
lm_inv = lm_inverse_res$enwas_res,
ns_inv = ns_inverse_res$enwas_res
)
)
forest_plot_mult(
list(
linear = linear_res2$enwas_res,
lm_inv = lm_inverse_res$enwas_res
)
)
forest_plot_mult(
list(
ns = all_ns_res$enwas_res,
ns_inv = ns_inverse_res$enwas_res
)
)
xwas_res_list <-
list(
linear = lm_inverse_res$enwas_res,
ns = all_ns_res$enwas_res,
lm_inv = lm_inverse_res$enwas_res,
ns_inv = ns_inverse_res$enwas_res
)
xwas_result <- do.call("rbind", xwas_res_list)
xwas_result$EnWAS <-
rep(names(xwas_res_list), each = nrow(xwas_res_list[[1]]))
xwas_result$postive <- xwas_result$lower * xwas_result$upper
xwas_result$postive <- xwas_result$postive >= 0.0001
# show the false positive raised in linear base model
num_cols <- sapply(data[, exposure_vars], is.numeric)
num_cols <- names(num_cols[num_cols == TRUE])
ns_vs_linear <-
xwas_result[xwas_result$EnWAS == "linear", ]$postive != xwas_result[xwas_result$EnWAS ==
"ns", ]$postive
num_cols[ns_vs_linear]
[1] “DR1TPROT” “DR1TFIBE” “DR1TCRYP” “DR1TNIAC” “DR1TSELE”
for (c in num_cols[ns_vs_linear]) {
g <-
ggplot(data,
aes_string(
x = "RIDAGEYR",
y = paste0("invNorm(", c, ")"),
colour = "RIAGENDR"
)) +
geom_point(alpha = 0.1) +
geom_smooth(
method = lm,
formula = y ~ ns(x, knots = seq(30, 80, by = 10)),
size = 1
)
print(g)
}
# ns vs inverse norm transformation
ns_vs_ns_inv <-
xwas_result[xwas_result$EnWAS == "ns_inv", ]$postive != xwas_result[xwas_result$EnWAS ==
"ns", ]$postive
num_cols[ns_vs_ns_inv]
[1] “DR1TPROT” “DR1TCRYP” “DR1TNIAC” “DR1TVK” “DR1TSELE”
for (c in num_cols[ns_vs_ns_inv]) {
g <-
ggplot(data,
aes_string(
x = "RIDAGEYR",
y = paste0("invNorm(", c, ")"),
colour = "RIAGENDR"
)) +
geom_point(alpha = 0.1) +
geom_smooth(
method = lm,
formula = y ~ ns(x, knots = seq(20, 80, by = 10)),
size = 1
)
print(g)
par(mfrow = c(1, 2))
hist(data[, c], main = c)
hist(invNorm(data[, c]), main = paste0("invNorm:", c))
}
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading
enthics <- c('Mexican American','Other Hispanic','Non-Hispanic White','Non-Hispanic Black','Other Race')
lm_base <- lm(as.formula(linear_model),data)
lm_df <- data_frame(residuals=lm_base$residual,ethnic = data$RIDRETH1,gender=data$RIAGENDR)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
levels(lm_df$ethnic) <- enthics
ggplot(lm_df, aes(x=ethnic, y=residuals, fill=gender)) + geom_boxplot()+
scale_x_discrete(guide = guide_axis(angle = 45))
ns_base <- lm(as.formula(all_ns_model),data)
ns_df <- data_frame(residuals=ns_base$residual,ethnic = data$RIDRETH1,gender=data$RIAGENDR)
levels(ns_df$ethnic) <- enthics
ggplot(ns_df, aes(x=ethnic, y=residuals, fill=gender)) + geom_boxplot()+
scale_x_discrete(guide = guide_axis(angle = 45))